# coding=utf-8
# -*- coding: utf-8 -*-
import scrapy
import re
import json
from fjggfw.items import FjggfwItem
from scrapy.http import Request
import pymysql
from selenium import webdriver
from datetime import datetime,timedelta


class FjggfwspdSpider(scrapy.Spider):
    """Spider for construction-project (GCJS) tender notices on ggzyfw.fj.gov.cn.

    Bootstraps the site's anti-bot cookie with a headless browser, then pages
    through the AJAX list endpoint for the last 7 days and follows each notice
    to its detail endpoint, yielding one ``FjggfwItem`` per notice.
    """

    name = 'fjggfwspd'
    allowed_domains = ['ggzyfw.fj.gov.cn']
    pageSize = 1000  # notices requested per list page

    # Crawl window: the 7 days ending at class-definition time.
    # NOTE(review): evaluated once at import time, not per crawl run —
    # confirm this is intended for long-lived processes.
    now = datetime.now()
    endTime = now.strftime('%Y-%m-%d %H:%M:%S')
    diff = timedelta(days=7)
    startTime = (now - diff).strftime('%Y-%m-%d %H:%M:%S')
    cookie = ''

    def start_requests(self):
        """Obtain the anti-bot cookie, then request list pages 1-7."""
        # The cookie is set by JavaScript on the site, so a headless browser
        # is needed once before plain HTTP requests can succeed.
        browser = webdriver.PhantomJS()
        try:
            browser.get('https://ggzyfw.fj.gov.cn/Website/JYXXNew.aspx')
            self.cookie = browser.get_cookie("_qddagsx_02095bad0b")["value"]
        finally:
            # Bug fix: quit() terminates the PhantomJS process itself
            # (close() only closes the window), and the finally-block
            # guarantees cleanup even when obtaining the cookie fails.
            browser.quit()

        self.headers = {"Accept": " text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                   "Accept-Encoding": " gb2312,utf-8",
                   "Accept-Language": " zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
                   "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0",
                   "Connection": "keep-alive",
                   "Cookie": "_qddagsx_02095bad0b=" + str(self.cookie)}

        for i in range(1, 8):
            # Build the list-page URL for page i and fetch it while
            # impersonating a regular browser via the headers above.
            nexturl = "https://ggzyfw.fj.gov.cn/Website/AjaxHandler/BuilderHandler.ashx?OPtype=GetListNew&pageNo="+str(i)+"&pageSize="+str(self.pageSize)+"&ProType=-1&TopTime="+self.startTime+"&endTime="+self.endTime+"&proArea=-1&category=GCJS&announcementType=1&xmlx=-1&rrr=0.16069953398867165"
            yield Request(nexturl, callback=self.parse, headers=self.headers)

    def parse(self, response):
        """Parse one JSON list page and schedule a detail request per notice."""
        jsobj = json.loads(response.text)
        notices = jsobj.get('data')
        if not notices:
            # Robustness: a missing/empty 'data' key previously crashed with
            # TypeError when iterating None; treat it as an empty page.
            return
        for notice in notices:
            item = FjggfwItem()
            item['industry'] = notice['PROTYPE_TEXT']
            item['area'] = notice['AREANAME']
            item['project_name'] = notice['NAME']
            item['tm'] = notice['TM'].replace('T', ' ')  # ISO 'T' separator -> space
            item['type'] = notice['TITLE']
            # M_ID arrives float-like; strip the '.0'.
            # NOTE(review): replace('.0', '') removes the substring anywhere,
            # not just at the end — confirm M_ID never contains '.0' inside.
            item['id'] = str(notice['M_ID']).replace('.0', '')
            item['url'] = "https://ggzyfw.fj.gov.cn/Website/JYXX_GCJS.aspx?ID=" + item["id"] + "&GGTYPE=" + notice['GGTYPE']

            url = "https://ggzyfw.fj.gov.cn/Website/AjaxHandler/BuilderHandler.ashx?OPtype=GetGGInfoPC&GGTYPE=1&ID=" + item['id']
            # Follow to the detail endpoint, carrying the partial item along.
            yield Request(url, callback=self.parse_detail, headers=self.headers, meta={'item': item})

    def parse_detail(self, response):
        """Parse a notice's JSON detail payload and complete the item."""
        jsonobj = json.loads(response.text)
        data = jsonobj['data'][0]

        item = response.meta['item']
        item['content_text'] = filter_script_style(data)
        item['tender_time'] = str(jsonobj['node'][0]['TM']).replace('T', ' ')
        item['open_time'] = jsonobj['baseInfo'][0]['T4']
        item['trading_center'] = jsonobj['baseInfo'][0]['SYSTEM_NAME']
        yield item

def filter_script_style(htmlstr):
    """Return the <body>...</body> portion of *htmlstr*, or a cleaned copy.

    Inline ``style="...;"`` attributes are removed first. If a <body> element
    is present its full matched text is returned; otherwise <script>/<style>
    elements and <tbody> tags are stripped from the style-cleaned string.
    """
    p_body = re.compile(r'(<body|<BODY).*(</body>|</BODY>)', re.S)
    p_in_style = re.compile(r'style=".*?;"', re.I)
    cleaned = p_in_style.sub('', htmlstr)
    body_match = p_body.search(cleaned)
    if body_match is not None:
        return body_match.group()
    # No <body> found: strip script/style blocks and <tbody> tags.
    # Bug fix: operate on the style-cleaned string, not the raw input —
    # the original re-ran the substitutions on `htmlstr`, silently
    # discarding the inline-style removal performed above.
    p_script = re.compile(r'(<script|<SCRIPT).*(</script>|</SCRIPT>)', re.S)
    p_style = re.compile(r'(<style|<STYLE).*(</style>|</STYLE>)', re.S)
    p_tbody = re.compile(r'<tbody>|</tbody>', re.I)
    result = p_script.sub('', cleaned)
    result = p_style.sub('', result)
    result = p_tbody.sub('', result)
    return result

def filter_tags(htmlstr):
    """Strip HTML markup from *htmlstr* and return the bare text.

    Removes CDATA sections, <script>/<style> elements, <br> tags, all other
    HTML tags, comments, doctype-like declarations, ``&nbsp;`` entities and
    every whitespace character, then decodes common character entities via
    ``replaceCharEntity``.
    """
    # Raw strings throughout: the original non-raw patterns relied on
    # invalid escape sequences (DeprecationWarning on modern Python).
    re_cdata = re.compile(r'//<!\[CDATA\[[^>]*//\]\]>', re.I)  # CDATA sections
    re_script = re.compile(r'<\s*script[^>]*>[^<]*<\s*/\s*script\s*>', re.I)  # <script> elements
    re_style = re.compile(r'<\s*style[^>]*>[^<]*<\s*/\s*style\s*>', re.I)  # <style> elements
    re_br = re.compile(r'<br\s*?/?>')  # <br> tags (deleted, not converted to '\n')
    re_doc = re.compile(r'<!\w+[^>]*>')  # doctype-like declarations
    re_space = re.compile(r'&nbsp;')  # non-breaking-space entities
    re_space2 = re.compile(r'\s')  # every whitespace character
    re_start = re.compile(r'<\w+[^>]*>')  # opening HTML tags
    re_end = re.compile(r'</\w+[^>]*>')  # closing HTML tags
    re_comment = re.compile(r'<!--[^>]*-->')  # HTML comments
    # (Dead code removed: re_td / re_span_start / re_span_end were compiled
    # but never applied; span and td markup is handled by re_start/re_end.)
    s = re_cdata.sub('', htmlstr)
    s = re_script.sub('', s)
    s = re_style.sub('', s)
    s = re_br.sub('', s)
    s = re_start.sub('', s)
    s = re_end.sub('', s)
    s = re_comment.sub('', s)
    s = re_space.sub('', s)
    s = re_space2.sub('', s)
    s = re_doc.sub('', s)
    # Collapse blank lines. Redundant today (re_space2 already removed every
    # '\n'), kept so behavior survives a future change to the whitespace rule.
    blank_line = re.compile(r'\n+')
    s = blank_line.sub('\n', s)
    s = replaceCharEntity(s)  # decode &gt;, &#160;, etc.
    return s


def replaceCharEntity(htmlstr):
    """Replace common HTML character entities with their literal characters.

    Handles the named and decimal forms listed in CHAR_ENTITIES (e.g. ``&gt;``
    and ``&#62;`` both become ``>``); any unrecognized entity is deleted.
    Add new entries to CHAR_ENTITIES to support more entities.
    """
    CHAR_ENTITIES = {'nbsp': ' ', '160': ' ',
                     'lt': '<', '60': '<',
                     'gt': '>', '62': '>',
                     'amp': '&', '38': '&',
                     'quot': '"', '34': '"', }

    entity_pat = re.compile(r'&#?(?P<name>\w+);')
    match = entity_pat.search(htmlstr)
    while match is not None:
        # Look up the entity name (e.g. 'gt' for '&gt;'); unknown names map
        # to the empty string, i.e. the entity is simply removed.
        replacement = CHAR_ENTITIES.get(match.group('name'), '')
        # Replace only the first (leftmost) occurrence, then rescan from the
        # start of the updated string.
        htmlstr = entity_pat.sub(replacement, htmlstr, 1)
        match = entity_pat.search(htmlstr)
    return htmlstr
